{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#import \n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the training set from train.csv (path is relative to the working directory)\n",
    "data = pd.read_csv(\"train.csv\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Function to normalize the dataset\n",
    "def feature_normalize(dataset):\n",
    "    \"\"\"\n",
    "    Standardize a dataset to zero mean and unit standard deviation along axis 0.\n",
    "\n",
    "    @param dataset NumPy array, pandas Series or pandas DataFrame of numbers\n",
    "    @return (dataset - mean) / std computed along axis 0; a constant\n",
    "            column (std == 0) comes back as zeros instead of NaN/inf\n",
    "    \"\"\"\n",
    "    mu = np.mean(dataset, axis=0)\n",
    "    sigma = np.std(dataset, axis=0)\n",
    "    # A constant column has zero spread and would trigger a division by zero.\n",
    "    # Dividing by 1 instead leaves its (already zero) centred values at 0.\n",
    "    sigma = np.where(sigma == 0, 1.0, sigma)\n",
    "    return (dataset - mu) / sigma"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Function to convert string type data to integers\n",
    "\n",
    "def str_to_int(dataset):\n",
    "    \"\"\"\n",
    "    Replace string (object dtype) and categorical columns with integer category codes.\n",
    "\n",
    "    @param dataset pandas DataFrame; it is modified in place\n",
    "    @return the same DataFrame with every object/category column replaced\n",
    "            by its category codes (pandas encodes missing values as -1)\n",
    "    \"\"\"\n",
    "    string_columns = dataset.select_dtypes(['object']).columns\n",
    "    print(string_columns)\n",
    "\n",
    "    for col in string_columns:\n",
    "        # Cast the single column; casting the whole frame here would try to\n",
    "        # store a DataFrame inside one column.\n",
    "        dataset[col] = dataset[col].astype('category')\n",
    "\n",
    "    categorical_columns = dataset.select_dtypes(['category']).columns\n",
    "    # Nothing to encode when the frame has no categorical columns.\n",
    "    if len(categorical_columns) > 0:\n",
    "        dataset[categorical_columns] = dataset[categorical_columns].apply(lambda x: x.cat.codes)\n",
    "    return dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "def one_hot(df, cols):\n",
    "    \"\"\"\n",
    "    One-hot encode the given columns; the DataFrame passed in is not modified.\n",
    "\n",
    "    @param df pandas DataFrame\n",
    "    @param cols a list of columns to encode \n",
    "    @return a DataFrame in which each column of cols is replaced by its\n",
    "            dummy columns (named <column>_<value>), appended at the end\n",
    "    \"\"\"\n",
    "    for each in cols:\n",
    "        dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)\n",
    "        # drop() returns a new frame, whereas `del df[each]` would remove\n",
    "        # the column from the caller's DataFrame as well.\n",
    "        remaining = df.drop(labels=[each], axis=1)\n",
    "        df = pd.concat([remaining, dummies], axis=1)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the Name, Ticket, Cabin and Parch columns from the training frame\n",
    "new_data = data.drop(\n",
    "    labels=['Name', 'Ticket', 'Cabin', 'Parch'],\n",
    "    axis=1\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocessing data\n",
    "\n",
    "def preprocessing(df):\n",
    "    \"\"\"\n",
    "    Prepare a passenger DataFrame for modelling.\n",
    "\n",
    "    Steps: fill missing Age values with the Age mean, one-hot encode Pclass,\n",
    "    convert string columns to integer codes, then normalize Age.\n",
    "\n",
    "    @param df pandas DataFrame with at least the Age and Pclass columns\n",
    "    @return a new, preprocessed DataFrame; the input frame is left unchanged\n",
    "    \"\"\"\n",
    "    # Work on a copy so re-running the notebook cells always starts from\n",
    "    # the caller's original data.\n",
    "    df = df.copy()\n",
    "\n",
    "    #Replace the nan with mean value\n",
    "    # fillna(..., inplace=True) returns None, so assigning its result would\n",
    "    # wipe the column; fill only the Age column and keep the returned Series.\n",
    "    df[\"Age\"] = df[\"Age\"].fillna(df[\"Age\"].mean())\n",
    "\n",
    "    #Onehot encoding Pclass\n",
    "    df = one_hot(df, [\"Pclass\"])\n",
    "\n",
    "    #String to integer\n",
    "    df = str_to_int(df)\n",
    "\n",
    "    #age normalization\n",
    "    df[\"Age\"] = feature_normalize(df[\"Age\"])\n",
    "\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy import stats\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['Sex', 'Age', 'Embarked'], dtype='object')\n"
     ]
    }
   ],
   "source": [
    "# Run the preprocessing pipeline on the trimmed training frame\n",
    "df_train = preprocessing(new_data)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   PassengerId  Survived  Pclass     Sex   Age  SibSp     Fare Embarked\n",
      "0            1         0       3    male  22.0      1   7.2500        S\n",
      "1            2         1       1  female  38.0      1  71.2833        C\n",
      "2            3         1       3  female  26.0      0   7.9250        S\n",
      "3            4         1       1  female  35.0      1  53.1000        S\n",
      "4            5         0       3    male  35.0      0   8.0500        S\n",
      "5            6         0       3    male   0.0      0   8.4583        Q\n",
      "6            7         0       1    male  54.0      0  51.8625        S\n",
      "7            8         0       3    male   2.0      3  21.0750        S\n",
      "8            9         1       3  female  27.0      0  11.1333        S\n",
      "9           10         1       2  female  14.0      1  30.0708        C\n"
     ]
    }
   ],
   "source": [
    "# Show the first ten rows of the trimmed training frame\n",
    "print(new_data.head(n=10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(891, 5)\n"
     ]
    }
   ],
   "source": [
    "# Columns at positions 2 to 6 of the preprocessed frame\n",
    "# (the following cell assigns `features` again with a wider slice)\n",
    "features = df_train.iloc[:, 2:7]\n",
    "print(features.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(891, 8)\n"
     ]
    }
   ],
   "source": [
    "# Feature matrix: every column from position 2 onwards, as a NumPy array\n",
    "features = df_train.iloc[:, 2:].values\n",
    "print(features.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(891, 1)\n"
     ]
    }
   ],
   "source": [
    "# The target is the Survived column. Column 0 of df_train is PassengerId,\n",
    "# so select the target by name; the double brackets keep the (n, 1) shape.\n",
    "labels = df_train[[\"Survived\"]].values\n",
    "print(labels.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the test set from test.csv (path is relative to the working directory)\n",
    "test_data = pd.read_csv('test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the same four columns that were dropped from the training frame\n",
    "new_test_data = test_data.drop(\n",
    "    labels=['Name', 'Ticket', 'Cabin', 'Parch'],\n",
    "    axis=1\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['Sex', 'Age', 'Embarked'], dtype='object')\n"
     ]
    }
   ],
   "source": [
    "# Run the preprocessing pipeline on the trimmed test frame\n",
    "test_features = preprocessing(new_test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   PassengerId  Pclass     Sex   Age  SibSp     Fare Embarked\n",
      "0          892       3    male  34.5      0   7.8292        Q\n",
      "1          893       3  female  47.0      1   7.0000        S\n",
      "2          894       2    male  62.0      0   9.6875        Q\n",
      "3          895       3    male  27.0      0   8.6625        S\n",
      "4          896       3  female  22.0      1  12.2875        S\n",
      "5          897       3    male  14.0      0   9.2250        S\n",
      "6          898       3  female  30.0      0   7.6292        Q\n",
      "7          899       2    male  26.0      1  29.0000        S\n",
      "8          900       3  female  18.0      0   7.2292        C\n",
      "9          901       3    male  21.0      2  24.1500        S\n"
     ]
    }
   ],
   "source": [
    "# Show the first ten rows of the trimmed test frame\n",
    "print(new_test_data.head(n=10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Encoding Sex data\n",
    "# 1 where the value equals 'male', 0 for anything else\n",
    "is_male = new_test_data['Sex'] == 'male'\n",
    "new_test_data['Sex'] = is_male.astype('int64')\n",
    "del is_male"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "#one hot encoding Pclass\n",
    "pclass_columns = new_test_data.loc[:, [\"Pclass\"]].columns\n",
    "new_test_data = one_hot(new_test_data, pclass_columns)\n",
    "del pclass_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(418, 8)\n"
     ]
    }
   ],
   "source": [
    "# Build the test matrix from the frame returned by preprocessing(), so the\n",
    "# test columns are encoded the same way as the training `features`.\n",
    "# Column 0 (PassengerId) is skipped. Doing the slice and the .values\n",
    "# conversion in one statement keeps the cell safe to re-run.\n",
    "Xtest = test_features.iloc[:, 1:].values\n",
    "print(Xtest.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
